#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(WGCNA)
library(expss)
library(polycor)
library(knitr)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

clustering_selected = 'DynamicHybrid'

print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybrid"

Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_15_WGCNA.Rmd)

# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% 
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))


# Dataset created with DynamicTreeMerged algorithm
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]


# GO DEA
# load('./../Data/Gandal/GO_DE_clusters.RData')

rm(DE_info, GO_annotations, clusterings)
print(paste0('There are ', length(unique(dataset$Module)),' modules:'))
## [1] "There are 62 modules:"
dataset$Module %>% table %>% sort(decreasing=TRUE) %>% print
## .
## #00C0BF #8DAB00 #F066EA #9AA800 #F07F4A #6B9AFF #E96AF1 #FE61D0 #8594FF 
##    1298    1277    1158    1149    1086     639     614     566     441 
## #00B2F3 #FF67A3 #00BF78 #00C085 #00C092 #E58702 #D675FD #00C19E #00C1AA 
##     395     366     328     327     312     295     291     277     259 
## #CA9700 #00B9E4 #D19300 #E06FF8 #D88F00 #9B8EFF #43B500 #00B6EC #FF6A97 
##     259     254     247     239     217     215     205     197     182 
##    gray #6FB000 #AFA200 #FE6E8A #C29B00 #A5A500 #00AAFE #00AEF9 #AD87FF 
##     178     176     175     168     166     160     158     158     131 
## #FB727C #00BC59 #00B930 #FA62D9 #00BE69 #DF8B00 #F47A5D #CA7BFF #00BDD3 
##     128     125     123     120     115     114     106      99      97 
## #48A0FF #00BEC9 #00C0B5 #13B700 #F8766D #BD81FF #B99E00 #00BBDC #00BB47 
##      97      95      94      84      82      72      62      57      52 
## #F663E2 #FF61C5 #7FAE00 #FF62BB #EB8332 #FF64AF #00A5FF #5CB300 
##      45      45      42      41      40      38      32      32

Module-Trait associations

datTraits = datMeta %>% dplyr::select(Diagnosis_, Region, Sex, Age, PMI, RNAExtractionBatch) %>%
            rename('Diagnosis' = Diagnosis_, 'ExtractionBatch' = RNAExtractionBatch)

# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = dataset$Module)
MEs = orderMEs(ME_object$eigengenes)

# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits, ML=TRUE)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))

# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
  print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated')) 
}
## [1] "1 correlation(s) could not be calculated"
moduleTraitCor = moduleTraitCor[complete.cases(moduleTraitCor),]
moduleTraitPvalue = moduleTraitPvalue[complete.cases(moduleTraitCor),]

# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)


labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels =  gsub('ME','',rownames(moduleTraitCor)), 
               yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
               textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
               main = paste('Module-Trait relationships'))

rm(ME_object, datTraits, MEs, moduleTraitCor, moduleTraitPvalue, textMatrix)

Selecting the three modules with highest (absolute) correlation to Diagnosis: #D675FD, #00BF78 and #13B700

pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
            mutate(ImportantModules = ifelse(Module=='#D675FD', '#D675FD',
                                      ifelse(Module=='#00BF78', '#00BF78',
                                      ifelse(Module=='#13B700', '#13B700', 'Others')))) %>%
            mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.1, 0.5))

table(plot_data$ImportantModules)
## 
## #00BF78 #13B700 #D675FD  Others 
##     328      84     291   15897
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color) + theme_minimal() + 
           ggtitle('Modules with strongest relation to Diagnosis'))

Gene Significance

Comparing this with the DEA plot, Gene significance close to 0 means the gene expression is not affected by Diagnosis and negative values means the gene is underexpressed in Autism samples. So the important thing about gene significance is its absolute value

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID')

ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.5) + 
         scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
  theme_minimal() + ggtitle('Gene Significance'))

Module Membership and Gene Significance relation

Genes from the module #D675FD

Module with the highest (absolute) correlation to Diagnosis (-0.93)

plot_data = dataset %>% dplyr::select(MM.D675FD, GS, gene.score) %>% filter(dataset$Module=='#D675FD')

ggplotly(plot_data %>% ggplot(aes(MM.D675FD, GS, color=gene.score)) + geom_point(alpha=0.5) + 
         scale_color_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
         theme_minimal())

Genes from the module #00BF78

Module with the second highest (absolute) correlation to Diagnosis (0.91)

plot_data = dataset %>% dplyr::select(MM.00BF78, GS, gene.score) %>% filter(dataset$Module=='#00BF78')

ggplotly(plot_data %>% ggplot(aes(MM.00BF78, GS, color=gene.score)) + geom_point(alpha=0.5) + 
         scale_color_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
         theme_minimal())

Genes from the module #13B700

Module with the third highest (absolute) correlation to Diagnosis and the top one with positive correlation (0.91)

plot_data = dataset %>% dplyr::select(MM.13B700, GS, gene.score) %>% filter(dataset$Module=='#13B700')


ggplotly(plot_data %>% ggplot(aes(MM.13B700, GS, color=gene.score)) + geom_point(alpha=0.5) + 
         scale_color_manual(values=SFARI_colour_hue(r=c(2:6,8,7))) + 
         theme_minimal())

Identifying important genes

Selecting the modules with the highest correlation to Diagnosis, and, from them, the genes with the highest module membership-(absolute) gene significance

*Ordered by \(\frac{MM+GS}{2}\)

Only three SFARI Genes in the three lists …

top_genes_1 = dataset %>% dplyr::select(ID, MM.D675FD, GS, gene.score) %>% filter(dataset$Module=='#D675FD') %>%
              rename('MM' = 'MM.D675FD') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
              top_n(10)

kable(top_genes_1, caption='Top 10 genes for module #D675FD')
Top 10 genes for module #D675FD
ID MM GS gene.score importance
ENSG00000153046 0.7905592 0.3254951 None 0.5580272
ENSG00000160908 0.7240197 0.3513986 None 0.5377092
ENSG00000159256 0.7536605 0.3007044 None 0.5271825
ENSG00000089177 0.7058893 0.3184176 None 0.5121535
ENSG00000180376 0.6530201 -0.3474614 None 0.5002408
ENSG00000184863 0.6611401 -0.3284024 None 0.4947713
ENSG00000079841 0.8338278 -0.1502685 2 0.4920481
ENSG00000244754 0.8556961 0.1265332 None 0.4911146
ENSG00000091428 0.8103045 -0.1444815 4 0.4773930
ENSG00000112701 0.8783071 0.0739720 None 0.4761395
top_genes_2 = dataset %>% dplyr::select(ID, MM.00BF78, GS, gene.score) %>% filter(dataset$Module=='#00BF78') %>%
              rename('MM' = 'MM.00BF78') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
              top_n(10)

kable(top_genes_2, caption='Top 10 genes for module #00BF78')
Top 10 genes for module #00BF78
ID MM GS gene.score importance
ENSG00000147419 0.8060082 -0.9461082 None 0.8760582
ENSG00000134108 0.8092190 -0.8920351 None 0.8506270
ENSG00000150768 0.8523198 -0.8412765 None 0.8467981
ENSG00000134440 0.8198250 -0.8361057 None 0.8279653
ENSG00000134265 0.7778789 -0.8478985 None 0.8128887
ENSG00000100934 0.7329018 -0.8317850 None 0.7823434
ENSG00000075303 0.7528216 -0.7943756 None 0.7735986
ENSG00000091140 0.7705895 -0.7718465 None 0.7712180
ENSG00000138138 0.7461832 -0.7841764 None 0.7651798
ENSG00000213585 0.7658138 -0.7458718 None 0.7558428
top_genes_3 = dataset %>% dplyr::select(ID, MM.13B700, GS, gene.score) %>% filter(dataset$Module=='#13B700') %>%
              rename('MM' = 'MM.13B700') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
              top_n(10)

kable(top_genes_3, caption='Top 10 genes for module #13B700')
Top 10 genes for module #13B700
ID MM GS gene.score importance
ENSG00000164209 0.8630768 -0.6128660 None 0.7379714
ENSG00000075790 0.7585227 -0.6298825 None 0.6942026
ENSG00000196950 0.7729037 -0.6077678 None 0.6903357
ENSG00000108946 0.8154412 -0.5593705 None 0.6874058
ENSG00000004897 0.7401680 -0.6063262 None 0.6732471
ENSG00000100614 0.7651184 -0.5325986 None 0.6488585
ENSG00000107679 0.6441001 -0.6004142 None 0.6222572
ENSG00000009844 0.7553578 -0.4852143 None 0.6202860
ENSG00000116918 0.6690866 -0.5708086 None 0.6199476
ENSG00000165832 0.7841533 -0.4526986 None 0.6184260
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
            mutate(color = ifelse(Module=='#D675FD', '#D675FD',
                           ifelse(Module=='#00BF78', '#00BF78',
                           ifelse(Module=='#13B700', '#13B700', 'gray')))) %>%
            mutate(alpha = ifelse(color %in% c('#D675FD', '#00BF78', '#13B700') & 
                                  ID %in% c(as.character(top_genes_1$ID), 
                                            as.character(top_genes_2$ID),
                                            as.character(top_genes_3$ID)), 1, 0.1))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              theme_minimal() + ggtitle('Important genes identified through WGCNA')


SFARI genes and Module-Diagnosis correlation

Modules with the strongest module-diagnosis correlation should have the highest percentage of SFARI Genes, but the plot shows this relation in modules with correlations close to 0, and the behaviour is the same for the other clusterings I have tried…

A good thing is that the gray cluster (which is actually all the genes that were left without a cluster) has a low correlation with ASD and also a low percentage of SFARI genes

plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>% 
            group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>% 
            left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>% 
            mutate(p=round(p/n*100,2)) 

for(i in 1:nrow(plot_data)){
  this_row = plot_data[i,]
  if(this_row$hasSFARIscore==FALSE & this_row$p==100){
    new_row = this_row
    new_row$hasSFARIscore = TRUE
    new_row$p = 0
    plot_data = plot_data %>% rbind(new_row)
  }
}

plot_data = plot_data %>% filter(hasSFARIscore==TRUE)

ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', se=FALSE) +
         geom_point(color=plot_data$Module, aes(id=Module)) +
         xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
         theme_minimal() + theme(legend.position = 'none'))
rm(i, this_row, new_row)